library(tidyverse)
library(lubridate) # work with dates
library(magrittr) # more pipes
library(broom) # tidy regression outputs
library(fixest) # fixest::feols() is very fast with big data and fixed effects
library(patchwork) # for combining plots
library(ggthemes) # for ggthemes
library(knitr) # for making regression tables in Latex
library(modelsummary) # for making regression tables 

shift <- data.table::shift # for lagging/leading variables

#### Load in MP data and tidy it ####

mp <- read_csv("data/analysis/MP_panelv2.csv")

## Key variables: 
# --
# fff_event: treatment, fridays for future protest
# --
# sum_tweets: sum of tweets
# sum_ctweets: sum of climate tweets
# --
# sum_spchs: sum of speeches
# sum_cspchs: sum of climate speeches
# --

## Load in speeches data to identify parliamentary sitting days
speeches <- read_csv("data/output/speeches.csv") %>% 
  distinct(speech_date) %>% 
  rename(date = speech_date) %>% 
  mutate(parliament_sitting = 1)

##' Recode the raw MP_panel data to add more date information
#'  add protest lags and leads
#'  add different measures of protest
#'  and recode some district-level variables (party, marginality)
mp_df <- mp %>% 
  # Recode dates for dummies
  mutate(date = ymd(date)) %>% 
  mutate(year = year(date),
         month = month(date),
         week = isoweek(date), # Calendar weeks, starting on Monday
         year_month = paste(year, month, sep = "_"),
         year_week = paste(year, week, sep = "_")) %>% 
  # Group by MP
  group_by(full_name) %>% 
  # Create a "lead" FFF event indicator, for investigating anticipation effects
  do(data.frame(., setNames(shift(.$fff_event, 2:1, type = "lead"), paste0('fff_event_laglead', 2:1)))) %>% 
  # Create a copy of the FFF event variable, to match syntax in later analysis
  mutate(fff_event_lag0 = fff_event) %>% 
  # Lag FFF events, 6 lags
  do(data.frame(., setNames(shift(.$fff_event, 1:5, type = "lag"), paste0('fff_event_lag', 1:5)))) %>% 
  # Generate dummy for whether there was a FFF event in last 0, 1, 2, 3, 4, 5 days
  mutate(fff_events_cumulative_lead2 = fff_event_laglead2 + fff_event_laglead1,
         # The anticipation variable for the cumulative measure
         fff_events_cumulative_lead1 = fff_event_laglead1, 
         # For instantaneous effect
         fff_events_cumulative0 = fff_event, 
         # For an effect the day of, day after ("*_lag1")
         fff_events_cumulative1 = fff_event + fff_event_lag1, 
         # For an effect the day of, day after, and day after that ("*_lag2")
         fff_events_cumulative2 = fff_event + fff_event_lag1 + fff_event_lag2, 
         fff_events_cumulative3 = fff_event + fff_event_lag1 + fff_event_lag2 + 
           fff_event_lag3,
         fff_events_cumulative4 = fff_event + fff_event_lag1 + fff_event_lag2 +
           fff_event_lag3 + fff_event_lag4,
         fff_events_cumulative5 = fff_event + fff_event_lag1 + fff_event_lag2 +
           fff_event_lag3 + fff_event_lag4 + fff_event_lag5) %>% 
  mutate(cumulative_fff_protests_all_time = cumsum(sum_fff_events)) %>% 
  ungroup() %>% 
  # Recode some MP characteristics
  mutate(lab1_con0 = case_when(party_value=="Conservative" ~ 0,
                               party_value=="Labour" ~ 1,
                               party_value=="Labour (Co-op)" ~ 1)) %>% 
  mutate(frontbench = ifelse(category == "minister" | 
                               category == "speaker" | 
                               category == "shadow minister", 1, 0),
         frontbench = ifelse(is.na(category), 0, frontbench)) %>%
  # Remove MPs with speaker roles from the analyses
  filter(full_name!="Dame Eleanor Laing",
         full_name!="Sir Lindsay Hoyle") %>%
  # Merge in the sitting days data
  full_join(., speeches, by = "date") %>% 
  mutate(parliament_sitting = ifelse(is.na(parliament_sitting), 0, 1)) %>% 
  # Count the number of sitting days to do, to use as groups
  group_by(full_name) %>% 
  mutate(tally_sitting = cumsum(parliament_sitting),
         # And shift the count to match the weeks
         tally_sitting = ifelse(parliament_sitting == 1, tally_sitting-1, tally_sitting)) %>% 
  filter(!is.na(full_name)) %>% 
  # Make climate outcomes binary, for robustness
  mutate(sum_ctweets_binary = if_else(sum_ctweets >= 1, 1, 0),
         sum_cspchs_binary = if_else(sum_cspchs >= 1, 1, 0)) %>%
  ungroup() %>% 
  as.data.frame()

saveRDS(mp_df, "data/analysis/mp_df.rds")


#### Direct effect of protest on tweets ####

## Run fixed effects regression models predicting tweets as a function of protest
# Fixest with stepwise and cumulative stepwise regressors

## Hypothesis 1: Direct effect of protest on online speech
baseline1 <- feols(sum_ctweets ~ fff_events_cumulative1 + sum_tweets |
              full_name + year_month,
            data = mp_df)
baseline2 <- feols(sum_ctweets ~ fff_events_cumulative1 + sum_tweets + frontbench |
              full_name + year_month,
            data = mp_df)
baseline3 <- feols(sum_ctweets ~ fff_events_cumulative1 + sum_tweets |
              full_name + year_week,
            data = mp_df)
baseline4 <- feols(sum_ctweets ~ fff_events_cumulative1 + sum_tweets + frontbench |
              full_name + year_week,
            data = mp_df)

## Baseline effects table

# Settings
gm <- tibble::tribble(
  ~raw,        ~clean,          ~fmt,
  # "r.squared", "R2",            2,
  "nobs",      "Observations",             0,
  "FE: full_name",  "MP fixed effect",      0,
  "FE: year_month", "Year-month fixed effect", 0,
  "FE: year_week", "Year-week fixed effect", 0)

# Table 1
modelsummary(models = list(baseline1, baseline2, baseline3, baseline4), 
             output = "latex",
             coef_rename = c(
               "fff_events_cumulative1" = "FFF Protest$_{t:t-1}$",
               "sum_tweets" = "Tweets (daily sum)",
               "frontbench" = "Frontbench"),
             gof_map = gm, 
             notes = list('Outcome is count of climate tweets',
                          'Standard errors clustered by MP'),
             title = 'Direct effect of Fridays for Future protest on MPs’ tweets',
             stars = c("+" = 0.1, "*" = 0.05, "**" = 0.01))
## -> copy Latex output into manuscript


#### Evaluate tweets and speeches with different lag structures ####

# First, in different windows: "cumulative" codings of the protest variable
# sw() argument: evaluates each of the variables in the brackets individually, so creates that many model in a list
lags_tweets <- feols(sum_ctweets ~ sum_tweets + 
                    sw(fff_events_cumulative_lead2, fff_events_cumulative_lead1,
                       fff_events_cumulative0, 
                       fff_events_cumulative1, fff_events_cumulative2, 
                       fff_events_cumulative3, fff_events_cumulative4,
                       fff_events_cumulative5) |
                    full_name + year_week,
                  data = mp_df)

## Collect the model outputs from each step sw() in the fixest::feols models

sw_steps <- seq_along(select(mp_df, starts_with("fff_events_cumulative")))
outs_lags_tweets <- as.data.frame(matrix(NA))

## Bind all the fixest_1 model outputs together using loops
for (step in sw_steps){
  int_lags_tweets <- tidy(lags_tweets[[step]]) %>%   
    filter(str_detect(term, "fff")) %>% 
    mutate(step_number = step)
  outs_lags_tweets <- bind_rows(outs_lags_tweets, int_lags_tweets)
}

# And tidy
outs_lags_tweets <- outs_lags_tweets %>% 
  select(-V1) %>% 
  slice(2:nrow(.)) %>% 
  set_colnames(c("term", "estimate", "std_error", "t_statistic", "p_value", "step_number")) %>% 
  mutate(days = as.integer(str_sub(term, start = -1L, end = -1L))) %>% 
  mutate(days = ifelse(str_detect(term, "lead"), -1*days, days)) %>% 
  mutate(model = "window")

## Speech models 

## First, the most basic speech model to match the syntax of the Twitter models
lags_speeches <- feols(sum_cspchs ~ sum_spchs +
                    sw(fff_events_cumulative_lead2, fff_events_cumulative_lead1,
                       fff_events_cumulative0, 
                       fff_events_cumulative1, fff_events_cumulative2, 
                       fff_events_cumulative3, fff_events_cumulative4,
                       fff_events_cumulative5) |
                    full_name + year_week,
                  data = mp_df)

outs_lags_speeches <- as.data.frame(matrix(NA))

## Bind all the fixest_1 model outputs together
for (step in sw_steps){
  int_lags_speeches <- tidy(lags_speeches[[step]]) %>%   
    filter(str_detect(term, "fff")) %>% 
    mutate(step_number = step)
  outs_lags_speeches <- bind_rows(outs_lags_speeches, int_lags_speeches)
}

# And tidy
outs_lags_speeches <- outs_lags_speeches %>% 
  select(-V1) %>% 
  slice(2:nrow(.)) %>% 
  set_colnames(c("term", "estimate", "std_error", "t_statistic", "p_value", "step_number")) %>% 
  mutate(days = as.integer(str_sub(term, start = -1L, end = -1L))) %>% 
  mutate(days = ifelse(str_detect(term, "lead"), -1*days, days)) %>% 
  mutate(model = "window")

## Plot coefficients for tweets and speeches side-by-side
figure_tweets_speeches <- rbind(outs_lags_tweets, outs_lags_speeches) %>% 
  mutate(ci_lower = estimate-1.96*std_error,
         ci_upper = estimate+1.96*std_error) %>% 
  mutate(outcome = rep(c("Online", "Offline"), each = 8)) %>% 
  ggplot(., aes(days, estimate, color = outcome, group = outcome, shape = outcome)) +
  geom_hline(yintercept = 0, color = "black", linetype = 2) +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
                 position = position_dodge(width = 0.5), size=0.75) +
  geom_point(aes(shape = outcome), 
             position = position_dodge(width = 0.5), size = 3) +
  scale_shape_manual(values = c(19,18)) +
  scale_x_continuous(breaks = seq(-2, 5, 1),
                     labels = c("-2\nW", "-1\nTh", 
                                "1\nFriday", "2\nF:Sa", 
                                "3\nF:Su", "4\nF:M", 
                                "5\nF:Tu",
                                "6\nF:W")) +
  scale_y_continuous(breaks = seq(-0.05, .30, .05)) +
  scale_color_manual(values=c("#79b321","#344d0e")) +
  labs(x = "Days in local FFF protest window", 
       colour = "Outcome",
       shape = "Outcome",
       y = "Estimate and 95% confidence interval") +
  theme_tufte(base_family = "Arial") +
  theme(legend.position = "bottom",
        legend.text = element_text(size=15),
        legend.title = element_text(size=15),
        axis.title.x = element_text(size=15),
        axis.text.x = element_text(size=15),
        axis.title.y = element_text(size=15),
        axis.text.y = element_text(size=15),
        panel.border = element_rect(colour = "black", fill=NA, size=1),
        # panel.grid.major = element_line(size=.075),
        panel.grid.major = element_blank())
figure_tweets_speeches
ggsave("plots/figure3.png", height = 6, width = 8, dpi = 600)


#### Other structures for time ####

## Next, consider a set of models where time is treated differently
#' Want to be matching the opportunity structure that MPs have for speaking
#' Three models:
#' Daily-level; weekly-level; sitting days

# A first set of models as in the baseline model: tweets and speeches
daily_tweets <- feols(sum_ctweets ~ sum_tweets + frontbench +
                           fff_events_cumulative1 |
                           full_name + year_week,
                         data = mp_df)
daily_speeches <- feols(sum_cspchs ~ sum_spchs + frontbench +
                          fff_events_cumulative1 |
                          full_name + year_week,
                        data = mp_df)


## Second, consider a model where the daily speech data is collapsed to the week level

mp_df_week <- mp_df %>%
  select(date, year_month, year_week, parliament_sitting,
         full_name, lab1_con0, frontbench,
         sum_tweets, sum_ctweets,
         sum_spchs, sum_cspchs,
         fff_event) %>%
  # Create weeks starting on Friday to aggregate with
  mutate(week_friday = week(round_date(date, "week", week_start = 1)),
         year_week_friday = paste(year(date), week_friday, sep = "_")) %>%
  relocate(c(week_friday, year_week_friday), .after = year_week) %>% 
  group_by(full_name, year_week_friday) %>% 
  mutate(parliament_sitting_week = max(parliament_sitting, na.rm=T),
         week_sum_spchs = sum(sum_spchs, na.rm=T),
         week_sum_cspchs = sum(sum_cspchs, na.rm=T),
         week_sum_tweets = sum(sum_tweets, na.rm=T),
         week_sum_ctweets = sum(sum_ctweets, na.rm=T),
         week_fff_event = sum(fff_event),
         week_frontbench = max(frontbench, na.rm=T)) %>% 
  ungroup() %>% 
  # And aggregate to the weekly level
  select(year_month, year_week_friday, parliament_sitting_week, 
         full_name, lab1_con0, week_frontbench,
         starts_with("week")) %>% 
  ## NB. Should be about 85,000 observations, if not check that year_week and year_week_friday aren't both included with distinct()
  distinct() %>% 
  # And create a lagged protest variable
  group_by(full_name) %>% 
  mutate(week_fff_event_lag1 = shift(week_fff_event, type = "lag", n = 1)) %>% 
  # Remove rows with NAs in the treatment variable induced by lags/leads
  filter(!is.na(week_fff_event_lag1))

# Tweets at the week-level
weekly_tweets <- feols(week_sum_ctweets ~ week_sum_tweets + week_frontbench + 
                           week_fff_event | # Don't need to lag the week because of how the weeks are defined
                           full_name + year_week_friday, 
                         data = mp_df_week)

# Speeches at the week-level
weekly_speeches <- feols(week_sum_cspchs ~ week_sum_spchs + week_frontbench +
                            week_fff_event |
                            full_name + year_week_friday,
                          data = mp_df_week)

## Third, consider the day to the next parliamentary sitting instead of the week

mp_df_sitting <- mp_df %>% 
  # Group by the cumulative number of days that parliament has been sitting since session began
  group_by(tally_sitting) %>% 
  # Calculate characteristics of that sitting window
  mutate(min_window_year = min(year),
         min_window_month = min(month),
         min_window_week = min(week),
         window_length = as.numeric(max(ymd(date)) - min(ymd(date)))) %>% 
  ungroup() %>%  
  # Calculate whether an MP spoke/tweeted during that interval
  group_by(full_name, tally_sitting) %>% 
  mutate(sum_tweets_window = sum(sum_tweets),
         sum_ctweets_window = sum(sum_ctweets),
         sum_spchs_window = sum(sum_spchs),
         sum_cspchs_window = sum(sum_cspchs)) %>% 
  # Calculate whether there were protests in that district during that interval
  mutate(sum_fff_events_window = sum(fff_event)) %>% 
  # Keep key variables
  select(date, parliament_sitting, tally_sitting,
         full_name, frontbench,
         starts_with("min_"), window_length, ends_with("_window")) %>% 
  ungroup() %>% 
  # Collapse data to the "parliament--sitting day period" --- a unit of opportunity
  distinct(tally_sitting, full_name, .keep_all=T) %>% 
  # Lag protest measure
  group_by(full_name) %>% 
  mutate(sum_fff_events_window_lag1 = shift(sum_fff_events_window, type = "lag", n = 1))

## Estimate the effect of climate protests on speaking in the next parliamentary sitting day

# Tweets at the sitting--day-level
sitting_tweets <- feols(sum_ctweets_window ~ sum_tweets_window + 
                           frontbench + window_length +
                           sum_fff_events_window_lag1 |
                           full_name + min_window_year^min_window_week, # Combine year and week into a fixed effect
                         data = mp_df_sitting)

# Speech at the sitting--day-level
sitting_speeches <- feols(sum_cspchs_window ~ sum_spchs_window + 
                             frontbench + window_length +
                             sum_fff_events_window_lag1 |
                             full_name + min_window_year^min_window_week, # Combine year and week into a fixed effect
                           data = mp_df_sitting)

## Put these together in a table
table_speech <- bind_rows(daily_tweets %>% tidy() %>% mutate(model = 5),
                          daily_speeches %>% tidy() %>% mutate(model = 6),
                          weekly_tweets %>% tidy() %>% mutate(model = 7),
                          weekly_speeches %>% tidy() %>% mutate(model = 8),
                          sitting_tweets %>% tidy() %>% mutate(model = 9), 
                          sitting_speeches %>% tidy() %>% mutate(model = 10)) %>% 
  set_colnames(c("term", "estimate", "std_error", "t_statistic", "p_value", "model")) %>% 
  filter(str_detect(term, "fff")) %>%
  mutate(covariates = "Yes") %>% 
  mutate(outcome = rep(c("Tweets", "Speeches"), times = 3)) %>% 
  mutate(unit_observation = rep(c("Daily", "Weekly", "Sitting days"), 
                                each = 2)) %>% 
  mutate(fe1 = c(daily_tweets$fixef_vars[[1]],
                 daily_speeches$fixef_vars[[1]],
                 weekly_tweets$fixef_vars[[1]],
                 weekly_speeches$fixef_vars[[1]],
                 sitting_tweets$fixef_vars[[1]],
                 sitting_speeches$fixef_vars[[1]])) %>% 
  mutate(fe2 = c(daily_tweets$fixef_vars[[2]],
                 daily_speeches$fixef_vars[[2]],
                 weekly_tweets$fixef_vars[[2]],
                 weekly_speeches$fixef_vars[[2]],
                 sitting_tweets$fixef_vars[[2]],
                 sitting_speeches$fixef_vars[[2]])) %>% 
  mutate(nobs = c(daily_tweets$nobs,
                  daily_speeches$nobs,
                  weekly_tweets$nobs,
                  weekly_speeches$nobs,
                  sitting_tweets$nobs,
                  sitting_speeches$nobs)) %>% 
  mutate_at(vars(estimate:std_error), ~format(round(., digits = 3), 
                                              nsmall = 3)) %>% 
  mutate_all(~as.character(.)) %>% 
  pivot_longer(names_to = "param", values_to = "value", 
               c(estimate:p_value, covariates:nobs)) %>% 
  mutate(value = ifelse(param == "std_error", 
                        paste("(", value, ")", sep = ""), value)) %>% 
  mutate(value = ifelse(param == "fe1" & value == "full_name", 
                        "MP", value)) %>%
  mutate(value = ifelse(param == "fe2" & value == "year_week", 
                        "Year--week", value)) %>% 
  mutate(value = ifelse(param == "fe2" & value == "year_week_friday", 
                        "Year--week", value)) %>% 
  mutate(value = ifelse(param == "fe2" & value == "min_window_year^min_window_week", 
                        "Year--week", value)) %>% 
  group_by(model) %>% 
  mutate(p_value = as.numeric(max(ifelse(param=="p_value", value, 0)))) %>% 
  mutate(value = ifelse(param == "estimate" & (p_value < 0.05 & p_value > 0.01),
                        paste(value, "*", sep = ""),
                        ifelse(param == "estimate" & (p_value < 0.01),
                               paste(value, "**", sep = ""),
                               value))) %>% 
  select(-p_value) %>% 
  ungroup() %>% 
  filter(param != "t_statistic", 
         param != "p_value") %>% 
  mutate(term = "fff_protest") %>% 
  pivot_wider(names_from = "model", values_from = "value", 
              names_prefix = "M") %>% 
  mutate(term = case_when(param == "estimate" ~ "FFF Protest",
                          param == "std_error" ~ "",
                          param == "covariates" ~ "Covariates",
                          param == "outcome" ~ "Outcome",
                          param == "unit_observation" ~ "Unit of observation",
                          param == "fe1" ~ "Unit fixed effect",
                          param == "fe2" ~ "Time fixed effect",
                          param == "nobs" ~ "Observations")) %>% 
  filter(term!="Outcome") %>% 
  select(-param) %>% 
  knitr::kable(., 
               format = "latex",
               booktabs = T,
               linesep = "")
table_speech

#### Full regression tables ####

gm <- tibble::tribble(
  ~raw,        ~clean,          ~fmt,
  # "r.squared", "R2",            2,
  "nobs",      "Observations",             0,
  "FE: full_name",  "MP fixed effect",      0,
  "FE: year_month", "Year-month fixed effect", 0,
  "FE: year_week", "Year-week fixed effect", 0,
  "FE: year_week_friday", "Year-week fixed effect", 0,
  "FE: min_window_year^min_window_week", "Year-week fixed effect", 0)

# Table APP-9
full_tweets_latex <- modelsummary(models = lags_tweets,
             output = "latex",
             gof_map = gm,
             title = 'Full regression table for online tweets outcome in figure 3',
             stars = c("*" = 0.05, "**" = 0.01),
             coef_rename = c(
               "sum_tweets" = "Sum tweets",
               "fff_events_cumulative_lead2" = "FFF (day of:lead 2)",
               "fff_events_cumulative_lead1" = "FFF (day of:lead 1)",
               "fff_events_cumulative0" =  "FFF (day of)",
               "fff_events_cumulative1" = "FFF (day of:lag 1)",
               "fff_events_cumulative2" = "FFF (day of:lag 2)",
               "fff_events_cumulative3" = "FFF (day of:lag 3)",
               "fff_events_cumulative4" = "FFF (day of:lag 4)",
               "fff_events_cumulative5" = "FFF (day of:lag 5)"))
full_tweets_latex

# Table APP-10
full_speeches_latex <- modelsummary(models = lags_speeches,
             output = "latex",
             gof_map = gm,
             title = 'Full regression table for offline speeches outcome in figure 3',
             stars = c("*" = 0.05, "**" = 0.01),
             coef_rename = c(
               "sum_spchs" = "Sum speeches",
               "fff_events_cumulative_lead2" = "FFF (day of:lead 2)",
               "fff_events_cumulative_lead1" = "FFF (day of:lead 1)",
               "fff_events_cumulative0" =  "FFF (day of)",
               "fff_events_cumulative1" = "FFF (day of:lag 1)",
               "fff_events_cumulative2" = "FFF (day of:lag 2)",
               "fff_events_cumulative3" = "FFF (day of:lag 3)",
               "fff_events_cumulative4" = "FFF (day of:lag 4)",
               "fff_events_cumulative5" = "FFF (day of:lag 5)"))
full_speeches_latex

# Table APP-11
full_speech_table_latex <- modelsummary(models = list(daily_tweets, daily_speeches,
                           weekly_tweets, weekly_speeches,
                           sitting_tweets, sitting_speeches),
             output = "latex",
             gof_map = gm,
             title = 'Full regression table for offline speeches outcome in figure 3',
             stars = c("*" = 0.05, "**" = 0.01),
             coef_rename = c(
               "sum_spchs" = "Sum speeches",
               "sum_tweets" = "Sum tweets",
               "frontbench" = "Frontbench",
               "fff_events_cumulative1" = "FFF (window: t:t+1)",
               "week_sum_tweets" = "Sum tweets (weekly)",
               "week_frontbench" = "Frontbench (week)",
               "week_sum_spchs" = "Sum speeches (weekly)",
               "week_fff_event" = "FFF events (weekly)",
               "sum_tweets_window" = "Sum tweets (window)",
               "window_length" = "Window length",
               "sum_spchs_window" = "Sum speeches (window)",
               "sum_fff_events_window_lag1" = "Sum FFF events (window, lag 1)"))
full_speech_table_latex